#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
const char* conv_2d_c1_subgroup_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n"
"#ifdef MNN_SUPPORT_FP16\n"
"#define GROUP_READ(ptr,offset) as_half(intel_sub_group_block_read_us((const __global ushort*)(ptr)+(offset)))\n"
"#define GROUP_READ2(ptr,offset) as_half2(intel_sub_group_block_read_us2((const __global ushort*)(ptr)+(offset)))\n"
"#define GROUP_READ4(ptr,offset) as_half4(intel_sub_group_block_read_us4((const __global ushort*)(ptr)+(offset)))\n"
"#define GROUP_READ8(ptr,offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr)+(offset)))\n"
"#define GROUP_WRITE(ptr,offset,val) intel_sub_group_block_write_us((const __global ushort*)(ptr)+(offset),as_ushort(val))\n"
"#define GROUP_WRITE2(ptr,offset,val) intel_sub_group_block_write_us2((const __global ushort*)(ptr)+(offset),as_ushort2(val))\n"
"#define GROUP_WRITE4(ptr,offset,val) intel_sub_group_block_write_us4((const __global ushort*)(ptr)+(offset),as_ushort4(val))\n"
"#define GROUP_WRITE8(ptr,offset,val) intel_sub_group_block_write_us8((const __global ushort*)(ptr)+(offset),as_ushort8(val))\n"
"#define GROUP_SHUFFLE(data,id) as_half(intel_sub_group_shuffle(as_ushort(data),id))\n"
"#define GROUP_SHUFFLE2(data,id) as_half2(intel_sub_group_shuffle(as_ushort2(data),id))\n"
"#define GROUP_SHUFFLE4(data,id) as_half4(intel_sub_group_shuffle(as_ushort4(data),id))\n"
"#define GROUP_SHUFFLE8(data,id) as_half8(intel_sub_group_shuffle(as_ushort8(data),id))\n"
"#else\n"
"#define GROUP_READ(ptr,offset) as_float(intel_sub_group_block_read((const __global uint*)(ptr)+(offset)))\n"
"#define GROUP_READ2(ptr,offset) as_float2(intel_sub_group_block_read2((const __global uint*)(ptr)+(offset)))\n"
"#define GROUP_READ4(ptr,offset) as_float4(intel_sub_group_block_read4((const __global uint*)(ptr)+(offset)))\n"
"#define GROUP_READ8(ptr,offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr)+(offset)))\n"
"#define GROUP_WRITE(ptr,offset,val) intel_sub_group_block_write((const __global uint*)(ptr)+(offset),as_uint(val))\n"
"#define GROUP_WRITE2(ptr,offset,val) intel_sub_group_block_write2((const __global uint*)(ptr)+(offset),as_uint2(val))\n"
"#define GROUP_WRITE4(ptr,offset,val) intel_sub_group_block_write4((const __global uint*)(ptr)+(offset),as_uint4(val))\n"
"#define GROUP_WRITE8(ptr,offset,val) intel_sub_group_block_write8((const __global uint*)(ptr)+(offset),as_uint8(val))\n"
"#define GROUP_SHUFFLE(data,id) intel_sub_group_shuffle(data,id)\n"
"#define GROUP_SHUFFLE2(data,id) intel_sub_group_shuffle(data,id)\n"
"#define GROUP_SHUFFLE4(data,id) intel_sub_group_shuffle(data,id)\n"
"#define GROUP_SHUFFLE8(data,id) intel_sub_group_shuffle(data,id)\n"
"#endif\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c1_c4_b2(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
")\n"
"{\n"
" const int f_block=get_group_id(1);\n"
" const int lid=get_sub_group_local_id();\n"
" const int b=get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 1;\n"
" const int y=(xy/x_blocks);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=1;\n"
" const uint input_y_pitch=input_x_pitch*input_width;\n"
" const uint input_f_pitch=input_y_pitch*input_height;\n"
" const uint input_b_pitch=input_f_pitch*INPUT_CHANNEL;\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" input_x*input_x_pitch;\n"
" const uint output_pack=(output_channel+3)/4;\n"
" const uint output_x_pitch=4;\n"
" const uint output_y_pitch=output_x_pitch*output_width;\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*batch;\n"
" \n"
" \n"
" const uint output_offset=b*output_fs_pitch +\n"
" f_block*4*output_b_pitch +\n"
" y*output_y_pitch +\n"
" x*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=256;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=f_block*filter_os_pitch;\n"
" uint bias_offset=f_block*16;\n"
" COMPUTE_FLOAT2 dst=(COMPUTE_FLOAT2)(GROUP_READ(biases,bias_offset));\n"
" \n"
" FLOAT line_cache[INPUT_CHANNEL*INPUT_BLOCK_SIZE];\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))\n"
" for (int i=0; i<INPUT_BLOCK_SIZE; i++)\n"
" {\n"
" const int in_elem=i*16+lid;\n"
" const int xb=in_elem % INPUT_LINE_SIZE;\n"
" const int yb=in_elem/INPUT_LINE_SIZE;\n"
" if (input_y+yb >= 0 && input_y+yb<input_height &&\n"
" input_x+xb >= 0 && input_x+xb<input_width)\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=input[input_offset +\n"
" ic*input_f_pitch +\n"
" xb*input_x_pitch +\n"
" yb*input_y_pitch];\n"
" else\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=0;\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++)\n"
" {\n"
" uint offset=filter_offset+kh*filter_y_pitch+kw*filter_x_pitch;\n"
" \n"
" COMPUTE_FLOAT wei[INPUT_CHANNEL];\n"
" __attribute__((opencl_unroll_hint(INPUT_CHANNEL)))\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" wei[ic]=GROUP_READ(weights,offset+ic*filter_isv_pitch);\n"
" \n"
" __attribute__((opencl_unroll_hint(2)))\n"
" for (int i=0; i<2; i++)\n"
" {\n"
" const uint buf_offset=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE)/16;\n"
" const uint buf_group=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE) % 16;\n"
" \n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++) {\n"
" COMPUTE_FLOAT src=GROUP_SHUFFLE(line_cache[ic*INPUT_BLOCK_SIZE+buf_offset],buf_group);\n"
" dst[i]=mad(wei[ic],src,dst[i]);\n"
" }\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT2)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT2)0,(COMPUTE_FLOAT2)6);\n"
"#endif\n"
" const uint lid_x=lid % 4;\n"
" const uint lid_y=lid/4;\n"
" if ((f_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<2 && (x+i)<output_width; i++) {\n"
" if ((f_block*16+lid_y*4<output_pack*4))\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" for (int i=0; i<2 && (x+i)<output_width; i++) {\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c1_c4_b4(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
")\n"
"{\n"
" const int f_block=get_group_id(1);\n"
" const int lid=get_sub_group_local_id();\n"
" const int b=get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 2;\n"
" const int y=(xy/x_blocks);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=1;\n"
" const uint input_y_pitch=input_x_pitch*input_width;\n"
" const uint input_f_pitch=input_y_pitch*input_height;\n"
" const uint input_b_pitch=input_f_pitch*INPUT_CHANNEL;\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" input_x*input_x_pitch;\n"
" const uint output_pack=(output_channel+3)/4;\n"
" const uint output_x_pitch=4;\n"
" const uint output_y_pitch=output_x_pitch*output_width;\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*batch;\n"
" \n"
" \n"
" const uint output_offset=b*output_fs_pitch +\n"
" f_block*4*output_b_pitch +\n"
" y*output_y_pitch +\n"
" x*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=256;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=f_block*filter_os_pitch;\n"
" uint bias_offset=f_block*16;\n"
" COMPUTE_FLOAT4 dst=(COMPUTE_FLOAT4)(GROUP_READ(biases,bias_offset));\n"
" \n"
" FLOAT line_cache[INPUT_CHANNEL*INPUT_BLOCK_SIZE];\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))\n"
" for (int i=0; i<INPUT_BLOCK_SIZE; i++)\n"
" {\n"
" const int in_elem=i*16+lid;\n"
" const int xb=in_elem % INPUT_LINE_SIZE;\n"
" const int yb=in_elem/INPUT_LINE_SIZE;\n"
" if (input_y+yb >= 0 && input_y+yb<input_height &&\n"
" input_x+xb >= 0 && input_x+xb<input_width)\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=input[input_offset +\n"
" ic*input_f_pitch +\n"
" xb*input_x_pitch +\n"
" yb*input_y_pitch];\n"
" else\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=0;\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++)\n"
" {\n"
" uint offset=filter_offset+kh*filter_y_pitch+kw*filter_x_pitch;\n"
" \n"
" COMPUTE_FLOAT wei[INPUT_CHANNEL];\n"
" __attribute__((opencl_unroll_hint(INPUT_CHANNEL)))\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" wei[ic]=GROUP_READ(weights,offset+ic*filter_isv_pitch);\n"
" \n"
" __attribute__((opencl_unroll_hint(4)))\n"
" for (int i=0; i<4; i++)\n"
" {\n"
" const uint buf_offset=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE)/16;\n"
" const uint buf_group=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE) % 16;\n"
" \n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++) {\n"
" COMPUTE_FLOAT src=GROUP_SHUFFLE(line_cache[ic*INPUT_BLOCK_SIZE+buf_offset],buf_group);\n"
" dst[i]=mad(wei[ic],src,dst[i]);\n"
" }\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" const uint lid_x=lid % 4;\n"
" const uint lid_y=lid/4;\n"
" if ((f_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<4 && (x+i)<output_width; i++) {\n"
" if ((f_block*16+lid_y*4<output_pack*4))\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" for (int i=0; i<4 && (x+i)<output_width; i++) {\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c1_c4_b8(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
")\n"
"{\n"
" const int f_block=get_group_id(1);\n"
" const int lid=get_sub_group_local_id();\n"
" const int b=get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 3;\n"
" const int y=(xy/x_blocks);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=1;\n"
" const uint input_y_pitch=input_x_pitch*input_width;\n"
" const uint input_f_pitch=input_y_pitch*input_height;\n"
" const uint input_b_pitch=input_f_pitch*INPUT_CHANNEL;\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" input_x*input_x_pitch;\n"
" const uint output_pack=(output_channel+3)/4;\n"
" const uint output_x_pitch=4;\n"
" const uint output_y_pitch=output_x_pitch*output_width;\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*batch;\n"
" \n"
" \n"
" const uint output_offset=b*output_fs_pitch +\n"
" f_block*4*output_b_pitch +\n"
" y*output_y_pitch +\n"
" x*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=256;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=f_block*filter_os_pitch;\n"
" uint bias_offset=f_block*16;\n"
" COMPUTE_FLOAT8 dst=(COMPUTE_FLOAT8)(GROUP_READ(biases,bias_offset));\n"
" \n"
" FLOAT line_cache[INPUT_CHANNEL*INPUT_BLOCK_SIZE];\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))\n"
" for (int i=0; i<INPUT_BLOCK_SIZE; i++)\n"
" {\n"
" const int in_elem=i*16+lid;\n"
" const int xb=in_elem % INPUT_LINE_SIZE;\n"
" const int yb=in_elem/INPUT_LINE_SIZE;\n"
" if (input_y+yb >= 0 && input_y+yb<input_height &&\n"
" input_x+xb >= 0 && input_x+xb<input_width)\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=input[input_offset +\n"
" ic*input_f_pitch +\n"
" xb*input_x_pitch +\n"
" yb*input_y_pitch];\n"
" else\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=0;\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++)\n"
" {\n"
" uint offset=filter_offset+kh*filter_y_pitch+kw*filter_x_pitch;\n"
" \n"
" COMPUTE_FLOAT wei[INPUT_CHANNEL];\n"
" __attribute__((opencl_unroll_hint(INPUT_CHANNEL)))\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" wei[ic]=GROUP_READ(weights,offset+ic*filter_isv_pitch);\n"
" \n"
" __attribute__((opencl_unroll_hint(8)))\n"
" for (int i=0; i<8; i++)\n"
" {\n"
" const uint buf_offset=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE)/16;\n"
" const uint buf_group=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE) % 16;\n"
" \n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++) {\n"
" COMPUTE_FLOAT src=GROUP_SHUFFLE(line_cache[ic*INPUT_BLOCK_SIZE+buf_offset],buf_group);\n"
" dst[i]=mad(wei[ic],src,dst[i]);\n"
" }\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
" const uint lid_x=lid % 4;\n"
" const uint lid_y=lid/4;\n"
" if ((f_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<8 && (x+i)<output_width; i++) {\n"
" if ((f_block*16+lid_y*4<output_pack*4))\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" for (int i=0; i<8 && (x+i)<output_width; i++) {\n"
" output[output_offset+lid_y*output_b_pitch+i*output_x_pitch+lid_x]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c1_c16_b2(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
")\n"
"{\n"
" const int f_block=get_group_id(1);\n"
" const int lid=get_sub_group_local_id();\n"
" const int b=get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 1;\n"
" const int y=(xy/x_blocks);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=1;\n"
" const uint input_y_pitch=input_x_pitch*input_width;\n"
" const uint input_f_pitch=input_y_pitch*input_height;\n"
" const uint input_b_pitch=input_f_pitch*INPUT_CHANNEL;\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" input_x*input_x_pitch;\n"
" const uint output_x_pitch=16;\n"
" const uint output_y_pitch=output_x_pitch*(output_pad_left+output_width+output_pad_right);\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*((output_channel+15)/16);\n"
" \n"
" \n"
" const uint output_offset=b*output_b_pitch +\n"
" f_block*output_fs_pitch +\n"
" y*output_y_pitch +\n"
" (x+output_pad_left)*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=256;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=f_block*filter_os_pitch;\n"
" uint bias_offset=f_block*16;\n"
" COMPUTE_FLOAT2 dst=(COMPUTE_FLOAT2)(GROUP_READ(biases,bias_offset));\n"
" \n"
" FLOAT line_cache[INPUT_CHANNEL*INPUT_BLOCK_SIZE];\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))\n"
" for (int i=0; i<INPUT_BLOCK_SIZE; i++)\n"
" {\n"
" const int in_elem=i*16+lid;\n"
" const int xb=in_elem % INPUT_LINE_SIZE;\n"
" const int yb=in_elem/INPUT_LINE_SIZE;\n"
" if (input_y+yb >= 0 && input_y+yb<input_height &&\n"
" input_x+xb >= 0 && input_x+xb<input_width)\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=input[input_offset +\n"
" ic*input_f_pitch +\n"
" xb*input_x_pitch +\n"
" yb*input_y_pitch];\n"
" else\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=0;\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++)\n"
" {\n"
" uint offset=filter_offset+kh*filter_y_pitch+kw*filter_x_pitch;\n"
" \n"
" COMPUTE_FLOAT wei[INPUT_CHANNEL];\n"
" __attribute__((opencl_unroll_hint(INPUT_CHANNEL)))\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" wei[ic]=GROUP_READ(weights,offset+ic*filter_isv_pitch);\n"
" \n"
" __attribute__((opencl_unroll_hint(2)))\n"
" for (int i=0; i<2; i++)\n"
" {\n"
" const uint buf_offset=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE)/16;\n"
" const uint buf_group=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE) % 16;\n"
" \n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++) {\n"
" COMPUTE_FLOAT src=GROUP_SHUFFLE(line_cache[ic*INPUT_BLOCK_SIZE+buf_offset],buf_group);\n"
" dst[i]=mad(wei[ic],src,dst[i]);\n"
" }\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT2)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT2)0,(COMPUTE_FLOAT2)6);\n"
"#endif\n"
" if(x == 0){\n"
" uint pad_offset=b*output_b_pitch+f_block*output_fs_pitch+y*output_y_pitch;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*output_x_pitch+lid]=0;\n"
" }\n"
" pad_offset += (output_width+output_pad_left)*output_x_pitch;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*output_x_pitch+lid]=0;\n"
" }\n"
" }\n"
" if ((f_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<2; i++) {\n"
" if ((f_block*16+lid<output_channel) && (x+i)<output_width)\n"
" output[output_offset+i*output_x_pitch+lid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" if (x+2 <= output_width || output_width % 2 == 0) {\n"
" GROUP_WRITE2(output,output_offset,CONVERT_FLOAT2(dst));\n"
" }else{\n"
" for (int i=0; i<output_width % 2; i++) {\n"
" output[output_offset+i*output_x_pitch+lid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c1_c16_b4(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
")\n"
"{\n"
" const int f_block=get_group_id(1);\n"
" const int lid=get_sub_group_local_id();\n"
" const int b=get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 2;\n"
" const int y=(xy/x_blocks);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=1;\n"
" const uint input_y_pitch=input_x_pitch*input_width;\n"
" const uint input_f_pitch=input_y_pitch*input_height;\n"
" const uint input_b_pitch=input_f_pitch*INPUT_CHANNEL;\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" input_x*input_x_pitch;\n"
" const uint output_x_pitch=16;\n"
" const uint output_y_pitch=output_x_pitch*(output_pad_left+output_width+output_pad_right);\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*((output_channel+15)/16);\n"
" \n"
" \n"
" const uint output_offset=b*output_b_pitch +\n"
" f_block*output_fs_pitch +\n"
" y*output_y_pitch +\n"
" (x+output_pad_left)*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=256;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=f_block*filter_os_pitch;\n"
" uint bias_offset=f_block*16;\n"
" COMPUTE_FLOAT4 dst=(COMPUTE_FLOAT4)(GROUP_READ(biases,bias_offset));\n"
" \n"
" FLOAT line_cache[INPUT_CHANNEL*INPUT_BLOCK_SIZE];\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))\n"
" for (int i=0; i<INPUT_BLOCK_SIZE; i++)\n"
" {\n"
" const int in_elem=i*16+lid;\n"
" const int xb=in_elem % INPUT_LINE_SIZE;\n"
" const int yb=in_elem/INPUT_LINE_SIZE;\n"
" if (input_y+yb >= 0 && input_y+yb<input_height &&\n"
" input_x+xb >= 0 && input_x+xb<input_width)\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=input[input_offset +\n"
" ic*input_f_pitch +\n"
" xb*input_x_pitch +\n"
" yb*input_y_pitch];\n"
" else\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=0;\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++)\n"
" {\n"
" uint offset=filter_offset+kh*filter_y_pitch+kw*filter_x_pitch;\n"
" \n"
" COMPUTE_FLOAT wei[INPUT_CHANNEL];\n"
" __attribute__((opencl_unroll_hint(INPUT_CHANNEL)))\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" wei[ic]=GROUP_READ(weights,offset+ic*filter_isv_pitch);\n"
" \n"
" __attribute__((opencl_unroll_hint(4)))\n"
" for (int i=0; i<4; i++)\n"
" {\n"
" const uint buf_offset=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE)/16;\n"
" const uint buf_group=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE) % 16;\n"
" \n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++) {\n"
" COMPUTE_FLOAT src=GROUP_SHUFFLE(line_cache[ic*INPUT_BLOCK_SIZE+buf_offset],buf_group);\n"
" dst[i]=mad(wei[ic],src,dst[i]);\n"
" }\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT4)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
"#endif\n"
" if(x == 0){\n"
" uint pad_offset=b*output_b_pitch+f_block*output_fs_pitch+y*output_y_pitch;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*output_x_pitch+lid]=0;\n"
" }\n"
" pad_offset += (output_width+output_pad_left)*output_x_pitch;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*output_x_pitch+lid]=0;\n"
" }\n"
" }\n"
" if ((f_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<4; i++) {\n"
" if ((f_block*16+lid<output_channel) && (x+i)<output_width)\n"
" output[output_offset+i*output_x_pitch+lid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" if (x+4 <= output_width || output_width % 4 == 0) {\n"
" GROUP_WRITE4(output,output_offset,CONVERT_FLOAT4(dst));\n"
" }else{\n"
" for (int i=0; i<output_width % 4; i++) {\n"
" output[output_offset+i*output_x_pitch+lid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((intel_reqd_sub_group_size(16)))\n"
"__kernel void conv_2d_buf_subgroup_c1_c16_b8(\n"
" __global FLOAT* input,\n"
" __global FLOAT* output,\n"
" __global FLOAT* weights,\n"
" __global FLOAT* biases,\n"
" __private const int pad_width,\n"
" __private const int pad_height,\n"
" __private const int input_width,\n"
" __private const int input_height,\n"
" __private const int output_width,\n"
" __private const int output_height,\n"
" __private const int output_channel,\n"
" __private const int batch,\n"
" __private const int x_blocks,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right\n"
")\n"
"{\n"
" const int f_block=get_group_id(1);\n"
" const int lid=get_sub_group_local_id();\n"
" const int b=get_global_id(2);\n"
" const int xy=get_global_id(0);\n"
" const int x=(xy % x_blocks) << 3;\n"
" const int y=(xy/x_blocks);\n"
" const int input_x=x*STRIDE_WIDTH-pad_width;\n"
" const int input_y=y*STRIDE_HEIGHT-pad_height;\n"
" const uint input_x_pitch=1;\n"
" const uint input_y_pitch=input_x_pitch*input_width;\n"
" const uint input_f_pitch=input_y_pitch*input_height;\n"
" const uint input_b_pitch=input_f_pitch*INPUT_CHANNEL;\n"
" const uint input_offset=b*input_b_pitch +\n"
" input_y*input_y_pitch +\n"
" input_x*input_x_pitch;\n"
" const uint output_x_pitch=16;\n"
" const uint output_y_pitch=output_x_pitch*(output_pad_left+output_width+output_pad_right);\n"
" const uint output_fs_pitch=output_y_pitch*output_height;\n"
" const uint output_b_pitch=output_fs_pitch*((output_channel+15)/16);\n"
" \n"
" \n"
" const uint output_offset=b*output_b_pitch +\n"
" f_block*output_fs_pitch +\n"
" y*output_y_pitch +\n"
" (x+output_pad_left)*output_x_pitch;\n"
" const uint filter_isv_pitch=16;\n"
" const uint filter_x_pitch=256;\n"
" const uint filter_y_pitch=filter_x_pitch*FILTER_WIDTH;\n"
" const uint filter_is_pitch=filter_y_pitch*FILTER_HEIGHT;\n"
" const uint filter_os_pitch=filter_is_pitch*((INPUT_CHANNEL+15)/16);\n"
" const uint filter_offset=f_block*filter_os_pitch;\n"
" uint bias_offset=f_block*16;\n"
" COMPUTE_FLOAT8 dst=(COMPUTE_FLOAT8)(GROUP_READ(biases,bias_offset));\n"
" \n"
" FLOAT line_cache[INPUT_CHANNEL*INPUT_BLOCK_SIZE];\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))\n"
" for (int i=0; i<INPUT_BLOCK_SIZE; i++)\n"
" {\n"
" const int in_elem=i*16+lid;\n"
" const int xb=in_elem % INPUT_LINE_SIZE;\n"
" const int yb=in_elem/INPUT_LINE_SIZE;\n"
" if (input_y+yb >= 0 && input_y+yb<input_height &&\n"
" input_x+xb >= 0 && input_x+xb<input_width)\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=input[input_offset +\n"
" ic*input_f_pitch +\n"
" xb*input_x_pitch +\n"
" yb*input_y_pitch];\n"
" else\n"
" line_cache[ic*INPUT_BLOCK_SIZE+i]=0;\n"
" }\n"
" }\n"
" __attribute__((opencl_unroll_hint(FILTER_HEIGHT)))\n"
" for (int kh=0; kh<FILTER_HEIGHT; kh++)\n"
" {\n"
" __attribute__((opencl_unroll_hint(FILTER_WIDTH)))\n"
" for (int kw=0; kw<FILTER_WIDTH; kw++)\n"
" {\n"
" uint offset=filter_offset+kh*filter_y_pitch+kw*filter_x_pitch;\n"
" \n"
" COMPUTE_FLOAT wei[INPUT_CHANNEL];\n"
" __attribute__((opencl_unroll_hint(INPUT_CHANNEL)))\n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++)\n"
" wei[ic]=GROUP_READ(weights,offset+ic*filter_isv_pitch);\n"
" \n"
" __attribute__((opencl_unroll_hint(8)))\n"
" for (int i=0; i<8; i++)\n"
" {\n"
" const uint buf_offset=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE)/16;\n"
" const uint buf_group=(kw*DILATION_WIDTH+STRIDE_WIDTH*i+(kh*DILATION_HEIGHT)*INPUT_LINE_SIZE) % 16;\n"
" \n"
" for (int ic=0; ic<INPUT_CHANNEL; ic++) {\n"
" COMPUTE_FLOAT src=GROUP_SHUFFLE(line_cache[ic*INPUT_BLOCK_SIZE+buf_offset],buf_group);\n"
" dst[i]=mad(wei[ic],src,dst[i]);\n"
" }\n"
" }\n"
" }\n"
" }\n"
"#ifdef RELU\n"
" dst=fmax(dst,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" dst=clamp(dst,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
" if(x == 0){\n"
" uint pad_offset=b*output_b_pitch+f_block*output_fs_pitch+y*output_y_pitch;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" output[pad_offset+i*output_x_pitch+lid]=0;\n"
" }\n"
" pad_offset += (output_width+output_pad_left)*output_x_pitch;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" output[pad_offset+i*output_x_pitch+lid]=0;\n"
" }\n"
" }\n"
" if ((f_block+1)*16 >= output_channel) {\n"
" for (int i=0; i<8; i++) {\n"
" if ((f_block*16+lid<output_channel) && (x+i)<output_width)\n"
" output[output_offset+i*output_x_pitch+lid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" else\n"
" {\n"
" if (x+8 <= output_width || output_width % 8 == 0) {\n"
" GROUP_WRITE8(output,output_offset,CONVERT_FLOAT8(dst));\n"
" }else{\n"
" for (int i=0; i<output_width % 8; i++) {\n"
" output[output_offset+i*output_x_pitch+lid]=(FLOAT)dst[i];\n"
" }\n"
" }\n"
" }\n"
"}\n"
;
#endif
#endif
}
